Competeing in the 2021 DSAC DVA competition!
Package Setup:
library(tidyverse)
library(janitor)
library(lubridate)
library(forcats)
library(gganimate)
Task: Create a plot using the folowing datasets…
Christmas songs in the Billboard top 100 list during December from 1958 to 2017 (christmas_billboard_data.csv)
Weather in Chicago on Christmas day from 1871 to 2018 (ChicagoWeatherChristmas.csv)
The gifts and the quantity of gifts acquired each day in “12 days of Christmas” (12_Days_of_Christmas.csv)
Here are the datasets:
weather <- read_csv(file="datasets/ChicagoWeatherChristmas.csv") %>% clean_names()
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Year = col_double(),
## `Fahrenheit High Temp` = col_double(),
## `Fahrenheit Min Temp` = col_double(),
## Precipitation = col_double(),
## Snow = col_double(),
## `White Christmas` = col_character(),
## `Celcius Min Temp` = col_double(),
## `Celcius High Temp` = col_double()
## )
songs <- read_csv("datasets/christmas_billboard_data.csv") %>% clean_names()
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## url = col_character(),
## weekid = col_character(),
## week_position = col_double(),
## song = col_character(),
## performer = col_character(),
## songid = col_character(),
## instance = col_double(),
## previous_week_position = col_double(),
## peak_position = col_double(),
## weeks_on_chart = col_double(),
## year = col_double(),
## month = col_double(),
## day = col_double()
## )
It should be easier to visualize this data by decade instead of each individual year.
After that I want to do things like:
total up the christmas snow per decade
total up the number of white christmas’s per decade
make an apporpirately themed ggplot!
weather_defined <- subset(weather, white_christmas!="Not Defined") %>%
mutate(decade= year-(year %% 10))
weather_defined$white_christmas <- as.logical(weather_defined$white_christmas)
white_xmas <- weather_defined %>% group_by(decade) %>%
summarize(n = n(),
white_xmases=sum(white_christmas),
total_snow=sum(snow),
total_precip = sum(precipitation),
percent_white =white_xmases/(white_xmases+n),
ave_snow = mean(snow),
sd_snow = sd(snow)
)
white_xmas
## # A tibble: 14 x 8
## decade n white_xmases total_snow total_precip percent_white ave_snow
## <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1880 4 2 3.3 0.3 0.333 0.825
## 2 1890 6 1 0.3 NA 0.143 0.05
## 3 1900 5 2 8.2 0.92 0.286 1.64
## 4 1910 9 3 1.8 0.19 0.25 0.2
## 5 1920 8 4 1.2 0.23 0.333 0.15
## 6 1930 9 3 8.7 NA 0.25 0.967
## 7 1940 8 5 3.8 0.75 0.385 0.475
## 8 1950 7 3 9.4 NA 0.3 1.34
## 9 1960 7 4 6.7 0.73 0.364 0.957
## 10 1970 6 4 2.7 0.51 0.4 0.45
## 11 1980 6 1 0.5 0.5 0.143 0.0833
## 12 1990 8 2 0.7 NA 0.2 0.0875
## 13 2000 8 4 2.9 NA 0.333 0.362
## 14 2010 9 4 2.3 0.24 0.308 0.256
## # … with 1 more variable: sd_snow <dbl>
Now let’s plot it in a festive way!
ggplot(white_xmas, aes(x=decade, y=total_snow)) + geom_line() + geom_point(aes(size=n), color="white") + theme_minimal() + scale_x_continuous(breaks=unique(white_xmas$decade)) + labs(x="Decade", y="Total Chirstmas Snowfall (inches)", size="# White Christmases") +
ggtitle("It Still Snows on Christmas in Chicago", subtitle = "...just not as much as it used to") +
theme(plot.background = element_rect(fill="lightsteelblue3"),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(linetype="dashed"),
text = element_text(colour="white"),
axis.text = element_text(colour="white")
)
ggsave("StillSnows.jpg", height=5, width=8)
songs$song <- factor(songs$song)
One interesting metric is how far away from the actual date of Christmas each of these weeks are. Let’s normalize the time component to 12/25 and see what it looks like.
songs$week_date <- mdy(songs$weekid)
#Set all years to 2000 or 2001 if the month is January
for (i in 1:length(songs$week_date)){
if (month(songs$week_date[i])==1){
year(songs$week_date[i]) <- 2001
}
else {
year(songs$week_date[i]) <- 2000
}
}
xmas <- mdy("12-25-2000")
songs <- mutate(songs, xmas_dist = as.numeric(difftime(week_date, xmas, units="days")))
summary(songs$xmas_dist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -50.000 -7.500 3.000 2.364 12.000 37.000
Now summarize by decade again to decluter the plot
songs <- mutate(songs, decade= year-(year %% 10))
songs$decade <- factor(songs$decade, ordered = T)
summary(songs$decade)
## 1950 1960 1970 1980 1990 2000 2010
## 21 144 44 35 24 50 69
Interesting in itself that there are so many more in the 1960s than any other decade
Below I am doing a few things. First, I want to calculate a single score to measure the “hit potential” of each song over these holiday months. I call this the “hit score”, and it is calculated by taking (100/peak_position) * weeks on the charts.
The higher the hit score, the higher and longer a particular song was on the charts for.
hit_score <- songs %>% group_by(performer, songid) %>%
summarise(peak_position=mean(peak_position),
weeks=mean(weeks_on_chart),
hit_score=(100/peak_position)*weeks) %>%
arrange(desc(hit_score)) %>%
ungroup()
## `summarise()` has grouped output by 'performer'. You can override using the `.groups` argument.
songs2 <- left_join(songs, select(hit_score, songid, hit_score), by="songid")
#Need to fix the one repeat with EXACT same hit score
songs2[songs2$songid=="BelieveBrooks & Dunn",]$hit_score <- 34
Now I want to rank each of the songs by hit_score based on the year and ALL previous years. This way we can see which decades generated the most chart-topping songs
songs_list <- list()
years <- unique(songs2$year)
for (i in 1:length(years)) {
songs_list[[i]] <- subset(songs2, year <=years[i]) %>%
ungroup() %>%
distinct(songid, .keep_all=T) %>%
#group_by(songid) %>%
mutate(rank=rank(-hit_score),
rank_year = years[[i]]) %>%
subset(rank<25) %>%
select(songid, performer, song, hit_score, year,decade,rank, rank_year)
}
songs_big_df <- Reduce(
function(x, y, ...) merge(x, y, all = TRUE, ...),
songs_list
)
songs_big_df <- mutate(songs_big_df,
label = paste(song, performer, year, sep=" "))
And animate it all over time!
color_pal <- c("1950"="#3C4930", "1960"="#8AAEE2", "1970"="#A6001D",
"1980"="#D00016","1990"="#D7BA5C",
"2000"="#BF5E73","2010"="#AF8952")
animation <- ggplot(songs_big_df, aes(rank, group = songid,
fill = as.factor(decade), color = as.factor(decade))) +
geom_tile(aes(y = hit_score/2,
height = hit_score,
width = 0.9), alpha = 0.8, color = NA) +
geom_text(aes(y = 0, label = paste(label, " ")), vjust = 0.1, hjust = -0.2, color="black") +
coord_flip(clip = "off", expand = FALSE) +
scale_x_reverse() +
scale_fill_manual(values=color_pal) +
guides(color = FALSE, fill = FALSE) +
labs(y="Hit Score (peak position * weeks on chart)") +
theme(axis.line=element_blank(),
#text = element_text(family = "Varela Round"),
#axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
#axis.title.x=element_blank(),
axis.title.y=element_blank(),
legend.position="none",
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
panel.grid.major.x = element_line( size=.1, color="grey" ),
panel.grid.minor.x = element_line( size=.1, color="grey" ),
plot.title=element_text(size=25,
#hjust=0.5,
face="bold", colour="firebrick",
#vjust=-1
),
plot.subtitle=element_text(size=18,
#hjust=0.5,
face="italic",
color="darkgreen"),
plot.caption =element_text(size=8,
#hjust=0.5,
face="italic", color="darkgrey"),
plot.background=element_rect(fill="#FFEDE1"
#"#DDE4F8"
),
#plot.margin = margin(2,2, 2, 4, "cm")
) +
transition_states(rank_year, transition_length = 4, state_length = 1) +
enter_fade() +
exit_fade() +
labs(title = 'Top 25 Holiday Hits Over the Years',
subtitle = 'Year: {closest_state}',
caption = "Data: Hot 100 singles chart from Billboard.com")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
#animation
animate(animation, duration=30,fps=20, width=450, height=600,
renderer=gifski_renderer("holidayhits.gif"))
top_song <- subset(songs, songid=="This One's For The ChildrenNew Kids On The Block"|
songid =="MistletoeJustin Bieber" |
songid == "AmenThe Impressions" |
songid == "All I Want For Christmas Is YouMariah Carey" |
songid =="Same Old Lang SyneDan Fogelberg") %>%
mutate(label = paste(song, performer, sep=" "))
top_song <- top_song %>% group_by(song)
ggplot(top_song, aes(x=xmas_dist, y=week_position, group=song)) + geom_line(aes(color=label)) + scale_y_reverse() + theme_minimal() +
labs(x="Days to Christmas", y = "Chart Position") +
transition_reveal(xmas_dist, )
This project was fun! I learned a few things along the way…namely:
Working with lubridate and date intervals
How to animate ggplots using gganimate
How to floor dates to decades